Free spoken digit dataset

Dataset loading

In [1]:
import os
import re
import scipy.io.wavfile as wav
import numpy as np

sampling_rate = 8000

# Development dataset
dev_path = './free-spoken-digit/dev'
dev_X = []
dev_y = []

for wav_file in os.listdir(dev_path):
    dev_y.append(re.split('[._]', wav_file)[1])
    dev_X.append(wav.read(dev_path + '/' + wav_file)[1])
    if wav.read(dev_path + '/' + wav_file)[0] != sampling_rate:
        print('Sampling error in: ' + dev_path + '/' + wav_file)

# Evaluation dataset
eval_path = './free-spoken-digit/eval'
eval_X = []

for wav_file in os.listdir(eval_path):
    eval_X.append(wav.read(eval_path + '/' + wav_file)[1])
    if wav.read(eval_path + '/' + wav_file)[0] != sampling_rate:
        print('Sampling error in: ' + eval_path + '/' + wav_file)

Data exploration

In [2]:
import matplotlib.pyplot as plt

# Signals: first visualization
fig, ax = plt.subplots(2, 3, figsize=(14,4))
for i in range(2):
    for j in range(3):
        ax[i][j].plot(dev_X[i*3+j])
plt.tight_layout()
plt.show()
<Figure size 1400x400 with 6 Axes>
In [3]:
# Signal vectors size
dev_sizes = [el.size for el in dev_X]
eval_sizes = [el.size for el in eval_X]

fig, ax = plt.subplots(1,2,figsize=(13,4))
ax[0].hist(dev_sizes, bins=200)
ax[0].set_title('dev signals length')
ax[1].hist(eval_sizes, bins=200)
ax[1].set_title('eval signals length')

plt.tight_layout()
plt.show()
In [4]:
# Developement dataset
X_d = np.array(dev_X)
y_d = np.array(dev_y)

# Evaluation dataset
X_e = np.array(eval_X)
In [5]:
print(X_d.shape)
print(y_d.shape)
print(X_e.shape)
(1500,)
(1500,)
(500,)
In [6]:
# Max and min amplitude for each signal
dev_max_amp = [np.max(sig) for sig in X_d]
dev_min_amp = [np.min(sig) for sig in X_d]
eval_max_amp = [np.max(sig) for sig in X_e]
eval_min_amp = [np.min(sig) for sig in X_e]

fig, ax = plt.subplots(2,2,figsize=(13,4))
ax[0][0].hist(dev_max_amp, bins=200)
ax[0][0].set_title('dev max amplitude')
ax[0][1].hist(dev_min_amp, bins=200)
ax[0][1].set_title('dev min amplitude')

ax[1][0].hist(eval_max_amp, bins=200)
ax[1][0].set_title('eval max amplitude')
ax[1][1].hist(eval_min_amp, bins=200)
ax[1][1].set_title('eval min amplitude')

plt.tight_layout()
plt.show()
In [7]:
# Max and min amplitude within a class
sig_class = '7'

dev_max_amp = [np.max(sig) for sig in X_d[y_d == sig_class]]
dev_min_amp = [np.min(sig) for sig in X_d[y_d == sig_class]]

fig, ax = plt.subplots(1,2,figsize=(13,4))
ax[0].hist(dev_max_amp, bins=200)
ax[0].set_title('dev max amplitude')
ax[1].hist(dev_min_amp, bins=200)
ax[1].set_title('dev min amplitude')

plt.tight_layout()
plt.show()

Normalization required!

Amplitude isn't constant in the same class

In [8]:
# Amplitude rescaled to 1
for i in range(X_d.size):
    X_d[i] = X_d[i] / np.max(np.abs(X_d[i]))

for i in range(X_e.size):
    X_e[i] = X_e[i] / np.max(np.abs(X_e[i]))
In [9]:
# Max and min amplitude for each signal
dev_max_amp = [np.max(sig) for sig in X_d]
dev_min_amp = [np.min(sig) for sig in X_d]
eval_max_amp = [np.max(sig) for sig in X_e]
eval_min_amp = [np.min(sig) for sig in X_e]

fig, ax = plt.subplots(2,2,figsize=(13,4))
ax[0][0].hist(dev_max_amp, bins=200)
ax[0][0].set_title('dev max amplitude')
ax[0][1].hist(dev_min_amp, bins=200)
ax[0][1].set_title('dev min amplitude')

ax[1][0].hist(eval_max_amp, bins=200)
ax[1][0].set_title('eval max amplitude')
ax[1][1].hist(eval_min_amp, bins=200)
ax[1][1].set_title('eval min amplitude')

plt.tight_layout()
plt.show()

Estrazione della parola nel file audio

L'obiettivo è quello di trimmare l'audio all'inizio e fine esatti di quando viene pronunciata la parola.

La lunghezza di una parola è sicuramente significativa sul suo contenuto ed eccetera...

Inoltre si può iterare questa tecnica per isolare delle sotto parti dell'onda (da provare).

Cumulative distribution function

In [10]:
def CDF(v):
    """Given a vector, returns the CDF vector of it"""
    cdf = np.zeros(v.size)
    cdf[0] = v[0]
    for i in range(1,v.size):
        cdf[i] = cdf[i-1] + v[i] 
    return cdf / np.sum(v)

Alte funzioni utili per trimmare

In [11]:
def lower_cdf_area(cdf, threshold=.01):
    """Ritorna l'indice della cdf per cui l'area a sx vale 'threshold'"""
    area_tot = np.sum(cdf)
    area = area_tot * threshold
    
    lower = 0
    upper = cdf.size - 1
    
    # Lower area
    a = 0
    for i in range(cdf.size):
        if a >= area/2:
            break
        a += cdf[i]   
        
    return i
In [12]:
def lower_cdf_perc(cdf, threshold=.01):
    """Ritorna l'indice della cdf per cui essa vale 'threshold'"""
    i = 0
    while cdf[i] < threshold and i < cdf.size:
        i += 1
    return i

Moving average

E' molto efficace per ridurre il sumore su un segnale se calcolata con finistra piccola

In [13]:
def moving_average(a, n=10):
    "Media mobile"
    ret = np.cumsum(a, dtype=float)
    ret[n:] = ret[n:] - ret[:-n]
    return ret[n - 1:] / n
In [14]:
# Es
fig, ax = plt.subplots(1,2,figsize=(10,4))
ax[0].plot(X_d[0])
ax[0].set_title('Segnale originale')
ax[1].plot(moving_average(X_d[0],10))
ax[1].set_title('Media mobile (finestra = 10)')

plt.tight_layout()
plt.show()

Pipeline: trim del segnale

  1. Pulizia del segnale mediante il calcolo della media mobile con finestra = 10
  2. Modulo della media mobile e filtraggio di tutte le componenti con ampiezza minore del 12%. Questo numero si ottiene facendo tuning e osservando che è buono perchè non riduce la parte parlata, ma rimuove il rumore residuo.
  3. Calcolo della CDF della funzione ottenuta
  4. Il segnale viene troncato per gli indici in cui la sua CDF è minore o maggiore di .0025 (via tuning).
In [15]:
# Es pipeline

fig, ax = plt.subplots(4,1,figsize=(12,15))

# Segnale
sig = X_d[5] #5

# moving avg
mov_avg = moving_average(sig, 10)
ax[0].plot(sig)
ax[0].plot(mov_avg, linewidth=2)
ax[0].set_title('Media mobile')

# abs(mov avg) filtered
mov_avg_abs = np.abs(mov_avg)
mov_avg_abs = mov_avg_abs/np.max(mov_avg_abs)
mov_avg_abs = np.where(mov_avg_abs < .12, 0, mov_avg_abs) #.1 buono
ax[1].plot(mov_avg_abs)
ax[1].set_title('Media mobile filtrata')

# cdf
cdf = CDF(mov_avg_abs)
ax[2].plot(cdf)
ax[2].set_title('CDF')

#upper & lower percentuali
lower_p = lower_cdf_perc(cdf, threshold=.0025)
upper_p = cdf.size - 1 - lower_cdf_perc(np.flip(1-cdf), threshold=.0025)

# segnale trimmato
ax[3].plot(sig)
ax[3].plot(lower_p, 0, '|r', markersize=100)
ax[3].plot(upper_p, 0, '|r', markersize=100)
ax[3].set_title('Trim')

plt.tight_layout()
plt.show()
In [16]:
def upper_lower(sig):
    # moving avg
    mov_avg = moving_average(sig, 10)
    
    # abs(mov avg) filtered
    mov_avg_abs = np.abs(mov_avg)
    mov_avg_abs = mov_avg_abs/np.max(mov_avg_abs)
    mov_avg_abs = np.where(mov_avg_abs < .12, 0, mov_avg_abs) #.1 buono
    
    # cdf
    cdf = CDF(mov_avg_abs)
    
    # upper & lower aree
    #lower = lower_cdf_area(cdf, threshold=1e-5)
    #upper = cdf.size - 1 - lower_cdf_area(np.flip(1-cdf), threshold=31e-5)  #3e-4 buono, 32e-5 troppo
    
    # Si osserva che upper & lower percentuali è più flessibile della versione sulle aree!
    
    # upper & lower percentuali
    lower = lower_cdf_perc(cdf, threshold=.0025)
    upper = cdf.size - 1 - lower_cdf_perc(np.flip(1-cdf), threshold=.0025)
    
    return upper,lower

Esempio su 50 segnali

In [17]:
n_sig = 50
n_grafici_per_sig = 1
n_plot = n_sig * n_grafici_per_sig
fig, ax = plt.subplots(n_plot,1,figsize=(10,4*n_plot))

for i in range(0,n_plot,n_grafici_per_sig):
    sig = X_d[i//n_grafici_per_sig]
    
    upper, lower = upper_lower(sig)
    
    # segnale trimmato
    ax[i].plot(sig)
    ax[i].plot(lower, 0, '|r', markersize=100)
    ax[i].plot(upper, 0, '|r', markersize=100)
    ax[i].set_title('Signal: ' + str(i))

plt.tight_layout()
plt.show()
In [18]:
fig, ax = plt.subplots(4,1,figsize=(15,15))
sig = dev_X[49]
sig = sig / np.max(np.abs(sig))

ax[0].plot(sig)

sq = sig**2
sq = sq / np.max(sq)
sq = np.where(sq < .02, 0, sq)
ax[1].plot(sq)


cdf = CDF(sq)
ax[2].plot(cdf)

thre = .0002
lower = lower_cdf_perc(cdf, threshold=.0002)
upper = cdf.size - 1 - lower_cdf_perc(np.flip(1-cdf), threshold=.002)

ax[3].plot(sig)
ax[3].plot(lower, 0, '|r', markersize=100)
ax[3].plot(upper, 0, '|r', markersize=100)

plt.tight_layout()
plt.plot();

Trim del dataset

In [19]:
for i in range(X_d.size):
    upp, low = upper_lower(X_d[i])
    X_d[i] = X_d[i][low:upp+1]
In [20]:
for i in range(X_e.size):
    upp, low = upper_lower(X_e[i])
    X_e[i] = X_e[i][low:upp+1]
In [21]:
plt.plot(X_d[7]);

Controllo la lunghezza dei segnali e filtro eventuali anomali

In [22]:
# Signal vectors size
dev_sizes = [el.size for el in X_d]
eval_sizes = [el.size for el in X_e]

fig, ax = plt.subplots(1,2,figsize=(13,4))
ax[0].hist(dev_sizes, bins=200)
ax[0].set_title('dev signals length')
ax[1].hist(eval_sizes, bins=200)
ax[1].set_title('eval signals length')

plt.tight_layout()
plt.show()
In [23]:
# Com'è possibile che ci siano ancora segnali lunghi più di 4000?
i_lunghi = sorted([(i,X_d[i].size) for i in range(X_d.size) if X_d[i].size > 4000], key=lambda x: -x[1])
i_lunghi = list(map(lambda x:x[0],i_lunghi))
lunghi = len(i_lunghi)

fig, ax = plt.subplots(lunghi,1,figsize=(13,4*lunghi))

for j,i in enumerate(i_lunghi):
    u,l = upper_lower(dev_X[i])
    ax[j].plot(dev_X[i])
    ax[j].plot(l,0,'r|',markersize=100)
    ax[j].plot(u,0,'r|',markersize=100)
    ax[j].set_title(i)
    
plt.tight_layout()
plt.plot();

Scarto i segnali più lunghi di 4500

In [24]:
# Conteggio scartati
max_size = 4500
print(f'dev X scartati: {len([el.size for el in X_d if el.size > max_size])/len(X_d)*100:.2f} %')
print(f'eval X scartati: {len([el.size for el in X_e if el.size > max_size])/len(X_e)*100:.2f} %')
dev X scartati: 1.40 %
eval X scartati: 0.60 %
In [25]:
# Filtraggio
dev_idx = [i for i in range(X_d.size) if X_d[i].size > max_size]
X_d = np.delete(X_d, dev_idx)
y_d = np.delete(y_d, dev_idx)
X_e = np.delete(X_e, [i for i in range(X_e.size) if X_e[i].size > max_size])
In [26]:
# Signal vectors size
dev_sizes = [el.size for el in X_d]
eval_sizes = [el.size for el in X_e]

fig, ax = plt.subplots(1,2,figsize=(13,4))
ax[0].hist(dev_sizes, bins=200)
ax[0].set_title('dev signals length')
ax[1].hist(eval_sizes, bins=200)
ax[1].set_title('eval signals length')

plt.tight_layout()
plt.show()

Identificazione delle principali 'gobbe' del segnale

In [27]:
fig, ax = plt.subplots(7,1,figsize=(15,27))
sig = dev_X[829] #dev_X[i_lunghi[3]]
sig = sig / np.max(np.abs(sig))

win = sig.size // 10

ax[0].plot(sig)

#mov_avg = moving_average(sig,10)
#mov_avg = mov_avg/np.max(np.abs(mov_avg))

# è il miogliore perchè non attenua troppo le 'gobbe' minori, ma sempre importanti (vedi dev_X[688])
ax[1].plot(sig**2)

ax[2].plot(np.convolve(sig**2,np.ones(win), mode='same'))

#mov_avg = mov_avg**3#moving_average(np.abs(sig),10)
ax[3].plot(sig**3)

ax[4].plot(np.convolve(np.abs(sig**3),np.ones(win), mode='same'))

ax[5].plot(sig**4)

ax[6].plot(np.convolve(sig**4,np.ones(win), mode='same'))

plt.tight_layout()
plt.plot();

Funzioni utili

In [28]:
def zero_crossing_idx(v):
    tol = 1e-8
    idx = []
    i = 0
    
    while np.abs(v[i]) < tol and i < v.size:
        i += 1
    prec = v[i]
    
    for j in range(i+1,v.size):
        if np.abs(v[j]) < tol:
            continue
            
        if v[j] * prec < 0:
            idx.append(j)
        prec = v[j]
    return idx
In [29]:
def der(sig, sampl_rate=8000):
    t = 1/sampl_rate
    d = np.zeros(sig.size - 1)
    for i in range(sig.size - 1):
        d[i] = (sig[i+1]-sig[i])/t
    return d
In [30]:
def staz(sig):
    """Calcola i punti stazionari del segnale"""
    return zero_crossing_idx(der(sig))
In [31]:
def massimi(sig, window=None):
    """Calcola i massimi del segnale"""
    if window is None:
        window = sig.size // 30
        
    d = der(sig)
    st = staz(sig)
    m = []
    
    for s in st:
        # media della derivata a sx (in una certa finestra)
        d_sx = np.average(d[s-window:s])
        # media della derivata a dx
        d_dx = np.average(d[s:s+window])
        
        if d_sx > 0 and d_dx < 0:
            m.append(s)
    
    # prima è massimi più grandi
    m = sorted(m,key=lambda x: -sig[x])
    
    # filtraggio massimi troppo vicini
    window = 300 #sig.size // 30
    m_filt = []
    m_filt.append(m[0])
    for i in m:
        if len([j for j in m_filt if np.abs(i - j) > window]) == len(m_filt):
            m_filt.append(i)
    return m_filt

Pipeline calcolo gobbe

  1. Normalizzazione del segnale alla sua ampiezza massima
  2. Calcolo del quadrato del segnale: permette di enfatizzare le 'gobbe'. Potenze maggiori (3,4,5) in alcuni casi sono troppo aggressive e stravolgono il segnale. Il quadrato è il migliore perchè non attenua troppo le 'gobbe' minori, ma sempre importanti (vedi dev_X[688]).
  3. Convoluzione per una 'porta' di ampiezza e altezza 1.3.
  4. Smoothing del segnale ottenuto: Trasformata di fourier e filtraggio della stessa. Il filtraggio è fatto sia tagliando le frequenze più altre che le frequenza con potenza inferiore al 10%
  5. Ricostruzione del segnale
  6. Calcolo dei massimi. Vengono filtrati quelli più vicini di $300\,T_{c}$ secondi e quelli sotto al 20% della potenza

Dopo aver osservato i risultati ho notato che è meglio fare il segnale alla quarta e non al quadrato

In [32]:
from scipy import fftpack

fig, ax = plt.subplots(4,1,figsize=(15,15))
sig = X_d[0] #dev_X[i_lunghi[3]] #621
sig = sig / np.max(np.abs(sig))

win = sig.size // 10

ax[0].plot(sig)
ax[0].set_title('Segnale originale')

ax[1].plot(sig**2)
ax[1].set_title('Segnale al quadrato')

conv = np.convolve(sig**2,np.ones(win), mode='same')

ax[2].plot(conv)
ax[2].set_title('Convoluzione x porta')

# Trasformata fi fourier
fft = fftpack.fft(conv)
freq = fftpack.fftfreq(fft.size, d=1/8000)

# filtro potenza minima in frequenza
spectrum = np.abs(fft)#fft**2
filt_fft = fft.copy()
#cutoff_idx = spectrum < (spectrum.max()/10)
#cutoff_idx = spectrum < (spectrum.max()/50)
#filt_fft[cutoff_idx] = 0

# filtro passa basso
t = 10
filt_fft[t:filt_fft.size//2] = 0
filt_fft[filt_fft.size//2 +t:] = 0

# Segnale ricostruito
cv = np.real(fftpack.ifft(filt_fft))
cv = (cv - np.min(cv))
cv = cv / np.max(cv)

i_max = massimi(cv)
i_max = [i for i in i_max if cv[i] > .2]

ax[3].plot(cv)
ax[3].plot(i_max,cv[i_max],'o')
ax[3].set_title('Convoluzione ricostruita\nMassimi')

plt.tight_layout()
plt.plot();
In [33]:
def massimi_potenza(sig, window_max=None):
    """Calcola la posizione delle principali 'gobbe' del segnale.
    Ritorna le posizioni in base all'importanza del massimo.
    """
    
    # Normalizzazione
    sig = sig / np.max(np.abs(sig))

    win = sig.size // 10
    
    # Convoluzione del segnale alla quarta
    conv = np.convolve(sig**4,np.ones(win), mode='same')

    # Trasformata fi fourier
    fft = fftpack.fft(conv)
    freq = fftpack.fftfreq(fft.size, d=1/8000)

    # filtro potenza minima in frequenza
    spectrum = np.abs(fft)
    filt_fft = fft.copy()
    #cutoff_idx = spectrum < (spectrum.max()/10)
    #cutoff_idx = spectrum < (spectrum.max()/50)
    #filt_fft[cutoff_idx] = 0

    # filtro passa basso
    t = 10
    filt_fft[t:filt_fft.size//2] = 0
    filt_fft[filt_fft.size//2 +t:] = 0

    # Segnale ricostruito
    cv = np.real(fftpack.ifft(filt_fft))
    cv = (cv - np.min(cv))
    cv = cv / np.max(cv)

    i_max = massimi(cv,window=window_max)
    i_max = [i for i in i_max if cv[i] > .2]
    
    # Sort per importanza
    i_max = sorted(i_max,key=lambda x: -cv[x])
    
    return np.array(i_max)
In [34]:
n_plot = 100
fig, ax = plt.subplots(n_plot,1,figsize=(15,4*n_plot))

for i in range(n_plot):
    sig = X_d[i]
    mas = massimi_potenza(sig)
    ax[i].plot(sig)
    ax[i].plot(mas[1:],np.zeros(len(mas)-1),'o',markersize=20)
    # Il più alto
    ax[i].plot(mas[0],0,'*', markersize=25)
C:\Users\mette\Anaconda3\lib\site-packages\numpy\lib\function_base.py:392: RuntimeWarning: Mean of empty slice.
  avg = a.mean(axis)
C:\Users\mette\Anaconda3\lib\site-packages\numpy\core\_methods.py:85: RuntimeWarning: invalid value encountered in double_scalars
  ret = ret.dtype.type(ret / rcount)

Features extraction

Alcune features che posso estrarre sono:

  • Durata del segnale nel tempo
  • Percentili / CDF del segnale nel tempo
  • Percentili / CDF del segnale nella frequenza

Zero crossing rate

In [35]:
def zero_crossing_rate(v):
    count = 0
    i = 0
    
    while v[i] == 0 and i < v.size:
        i += 1
    prec = v[i]
    
    for j in range(i+1,v.size):
        if v[j] == 0:
            continue
            
        if v[j] * prec < 0:
            count += 1
        prec = v[j]
    return count
In [36]:
# Example
my_array = np.array([0,0,-4,0,0,5,4,-1]) # -> 2

print(zero_crossing_rate(my_array))
plt.plot(my_array);
2

Frequency domain

In [37]:
from scipy import fftpack

def freq_domain(sig, sampling_rate=8000, cutoff=.2):
    """
    Return: bilateral_frequencies, bilateral_amplitude, bilateral_filtered_amplitude
    Note: power = np.abs(bilateral_amplitude)
          positive_frequencies = bilateral_frequencies[:bilateral_frequencies.size // 2]
    """
    time_step = 1/sampling_rate

    # The FFT of the signal
    sig_fft = fftpack.fft(sig)

    # And the power (sig_fft is of complex dtype)
    power = np.abs(sig_fft)

    # The corresponding frequencies
    sample_freq = fftpack.fftfreq(sig.size, d=time_step)
    
    # Removing high frequencies
    high_freq_fft = sig_fft.copy()
    high_freq_fft[(power / np.max(power)) < cutoff] = 0
    
    return sample_freq, sig_fft, high_freq_fft

Autocorrelation: may reduce noise?

Autocorrelation is useful for finding repeating patterns in a signal, such as determining the presence of a periodic signal which has been buried under noise, or identifying the fundamental frequency of a signal which doesn't actually contain that frequency component, but implies it with many harmonic frequencies (Wikipedia 2006).

Hence: Autocorrelation is good to remove noise in the frequency domain

In [38]:
def autocorrelation(x):
    autocorr = np.correlate(x, x, mode='full')
    autocorr = autocorr[autocorr.size//2:]
    autocorr = autocorr / np.max(autocorr)
    return autocorr
In [39]:
# Example
x = X_d[2]
autocorr = np.correlate(x, x, mode='full')
autocorr = autocorr[autocorr.size//2:]
autocorr = autocorr / np.max(autocorr)

fig, ax = plt.subplots(1,2,figsize=(14,5))

ax[0].plot(x)
ax[1].plot(autocorr)

plt.show()

Average frequency domain per class

In [40]:
import seaborn as sns

fig, ax = plt.subplots(10, 2, figsize=(14, 30))

for cl in range(10):
    class_lab = str(cl)
    signals = X_d[y_d == class_lab]

    n_sig = 300

    mean_pot_sig = None

    for i, sig in enumerate(signals[:n_sig]):
        
        # Considero l'autocorrelazione
        sig = autocorrelation(sig)
        
        freq, freq_amp, filt_freq_amp = freq_domain(sig, sampling_rate)

        # Potenza
        filt_pot = np.abs(filt_freq_amp)
        pot = np.abs(freq_amp)

        mean_pot_sig = mean_pot_sig + filt_pot / n_sig if mean_pot_sig is not None else filt_pot / n_sig
        #mean_pot_sig = mean_pot_sig + filt_pot / n_sig if mean_pot_sig is not None else filt_pot / n_sig
    
    # Log transformation of signal power
    #mean_pot_sig = np.log(np.where(mean_pot_sig == 0, 0.1, mean_pot_sig))
    
    ax[cl][0].plot(freq[:mean_pot_sig.size//2], mean_pot_sig[:mean_pot_sig.size//2])
    ax[cl][0].set_title('Classe ' + class_lab + ': potenza media')
    
    sns.heatmap([mean_pot_sig[:2000]], ax=ax[cl][1]); 
    #sns.heatmap([mean_pot_sig[:mean_pot_sig.size//2]], ax=ax[cl][1]);
    ax[cl][1].set_title('Classe ' + class_lab + ': potenza heatmap')
        
    
plt.tight_layout()
plt.show()
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-40-4adfa1e8d0f9> in <module>
     22         pot = np.abs(freq_amp)
     23 
---> 24         mean_pot_sig = mean_pot_sig + filt_pot / n_sig if mean_pot_sig is not None else filt_pot / n_sig
     25         #mean_pot_sig = mean_pot_sig + filt_pot / n_sig if mean_pot_sig is not None else filt_pot / n_sig
     26 

ValueError: operands could not be broadcast together with shapes (2258,) (2685,) 

Spectral centroid

Spectral centroid: \begin{equation} C = \frac{\sum_i f_i\cdot |A(f_i)|}{\sum_j |A(f_j)|} = \sum_i f_i \cdot \frac{|A(f_i)|}{\sum_j |A(f_j)|} \end{equation} Dove:

  • $f_i$ è la frequenza i-esima.
  • $A(f_i)$ è l'intensità della frequenza i-esima.
  • $|A(f_i)|$ è la potenza della frequenza i-esima.

Osservazione:

  • $\frac{|A(f_i)|}{\sum_j |A(f_j)|} > 0 \,\, \forall i$
  • $\sum_i \frac{|A(f_i)|}{\sum_j |A(f_j)|} = 1$

Dunque $\frac{|A(f_i)|}{\sum_j |A(f_j)|}$ si può considerare come la distribuzione di probabilità di una variabile aleatoria discreta $f$ e $C$ è il valore atteso.

Posso quindi calcolare anche la varianza di tale variabile aleatoria con la formula:

\begin{equation} V = E(f^2) - C^2 = E(f^2) - E(f)^2 = \sum_i f_i^2 \cdot \frac{|A(f_i)|}{\sum_j |A(f_j)|} - \left(\sum_i f_i \cdot \frac{|A(f_i)|}{\sum_j |A(f_j)|}\right)^2 \end{equation}
In [41]:
def spectral_centroid(f, pot):
    return np.sum(f * pot) / np.sum(pot)
In [42]:
def spectral_centroid_var(f, pot, centroid=None):
    """Spectral centroid variance"""
    if centroid is None:
        centroid = spectral_centroid(f, pot)
    return np.sum(f**2 * pot) / np.sum(pot) - centroid
In [43]:
def spectral_info(sig):
    """
    Ritorna lo spectral centroid e la deviazione standard, 
    considerando la potenza come una distribuzione di probabilità
    """
    
    # Pulisco lo spettro del segnale
    sig = autocorrelation(sig)
    
    # Fourier transform
    fft = fftpack.fft(sig)
    freq = fftpack.fftfreq(fft.size, d=1/8000)
    
    # Filtraggio delle frequenze a intensità minore
    pot = np.abs(fft)
    fft = np.where(pot/np.max(pot) < .1, 0, fft)
    
    # Cut: solo le freq positive
    fft = fft[:fft.size//2]
    freq = freq[:freq.size//2]
    
    centroid = spectral_centroid(freq, np.abs(fft))
    std_dev_spectrum = np.sqrt(spectral_centroid_var(freq, np.abs(fft),centroid))
    
    return centroid, std_dev_spectrum
In [44]:
spectral_info(X_d[0])
Out[44]:
(527.0715538297464, 557.2339078239443)

Queste due misure sono migliori a identificare la classe quando lo spettro è filtrato o no?

Autocorrelation in the same class label

In [45]:
class_lab = '9'
signals = X_d[y_d == class_lab]

n_sig = 5
fig, ax = plt.subplots(2*n_sig, 2, figsize=(13, 40))

mean_pot_sig = None

for i, sig in zip(range(0,2*n_sig,2),signals[:n_sig]):
    
    # Original signal: FFT
    freq, f_amp, f_filt_amp = freq_domain(sig, cutoff=.1)
    f = freq[:freq.size // 2]
    f_a = f_amp[:f_amp.size // 2]
    f_filt_a = f_filt_amp[:f_filt_amp.size // 2]
    
    # Spettro completo
    ax[i][0].plot(f, np.abs(f_a))
    ax[i][0].plot(spectral_centroid(f, np.abs(f_a)), 0, 'o') # Spectral centroid
    ax[i][0].set_title(str(i//2+1) + ': Segnale originale')
    
    # Spettro filtrato
    ax[i+1][0].plot(f, np.abs(f_filt_a))
    ax[i+1][0].plot(spectral_centroid(f, np.abs(f_filt_a)), 0, 'o') # Spectral centroid
    ax[i+1][0].set_title(str(i//2+1) + ': Segnale originale (spettro filtrato)')
    
    
    # Autocorrelation: FFT
    freq_a, f_amp_a, f_filt_amp_a = freq_domain(autocorrelation(sig), cutoff=.1)
    f_aut = freq_a[:freq_a.size // 2]
    f_a_aut = f_amp_a[:f_amp_a.size // 2]
    f_filt_a_aut = f_filt_amp_a[:f_filt_amp_a.size //2]
    
    # Spettro completo
    ax[i][1].plot(f_aut, np.abs(f_a_aut))
    ax[i][1].plot(spectral_centroid(f_aut, np.abs(f_a_aut)), 0, 'o') # Spectral centroid
    ax[i][1].set_title(str(i//2+1) + ': Autocorrelazione' \
                       + '\nDev st: ' + str(np.sqrt(spectral_centroid_var(f_aut, np.abs(f_a_aut)))))
    
    # Spettro filtrato
    ax[i+1][1].plot(f_aut, np.abs(f_filt_a_aut))
    ax[i+1][1].plot(spectral_centroid(f_aut, np.abs(f_filt_a_aut)), 0, 'o') # Spectral centroid
    ax[i+1][1].set_title(str(i//2+1) + ': Autocorrelazione (spettro filtrato)' \
                       + '\nDev st: ' + str(np.sqrt(spectral_centroid_var(f_aut, np.abs(f_filt_a_aut)))))
    
plt.tight_layout()
plt.show()

Si osserva come il comportamento in frequenza dell'autocorrelazione sia molto meno rumoroso rispetto a quello del segnale originale.

Il centroide in frequenza individua correttamente le frequenze più importanti se:

  1. Lo spettro è quello dell'autocorrelazione del segnale. Dunque è ripulito dal rumore
  2. Lo spettro è filtrato con un cutoff = 0.1.

Next step: calcolare il centroide in frequenza per ciascun segnale e plottarne la distribuzione di ogni classe. Valutare la media e la varianza per ciascuna classe. Questa procedura è una semplificazione dello spettro in fequenza.

Percentili

Riguardano la distribuzione delle ampiezze di un segnale. Posso prenderne 10 per ogni segnale ad esempio: range(5,96,10).

In [46]:
def time_percentiles(sig,p):
    perc = []
    for i in p:
        perc.append(np.percentile(sig,i))
    return np.array(perc)
In [47]:
def freq_percentiles(sig,p):
    fft = fftpack.fft(sig)
    fft = np.abs(fft[:fft.size//2])
    perc = []
    for i in p:
        perc.append(np.percentile(fft,i))
    return np.array(perc)
In [48]:
# Andamento dei percentili dei primi n segnali
fig, ax = plt.subplots()
for j in range(10):
    sig = X_d[j] # np.abs(X_d[j])
    perc = []
    for i in range(100):
        perc.append(np.percentile(sig,i))
    ax.plot(perc)
In [49]:
# Andamento dei percentili della potenza dei primi n segnali
fig, ax = plt.subplots()
for j in range(10):
    sig = X_d[j]
    fft = fftpack.fft(sig)
    fft = np.abs(fft[:fft.size//2])
    perc = []
    for i in range(100):
        perc.append(np.percentile(fft,i))
    ax.plot(perc)

Gobbe dello spettro in frequenza: non funziona bene come nel tempo. Cestinato

In [50]:
def massimi_potenza_freq(sig, window_max=None):
    """Calcola la posizione delle principali 'gobbe' del segnale"""
    
    # Normalizzazione
    sig = sig / np.max(np.abs(sig))

    win = sig.size // 50
    
    # Convoluzione del segnale alla quarta
    conv = np.convolve(sig,np.ones(win), mode='same')

    # Trasformata fi fourier
    fft = fftpack.fft(conv)
    freq = fftpack.fftfreq(fft.size, d=1/8000)

    # filtro potenza minima in frequenza
    spectrum = np.abs(fft)
    filt_fft = fft.copy()
    #cutoff_idx = spectrum < (spectrum.max()/10)
    #cutoff_idx = spectrum < (spectrum.max()/50)
    #filt_fft[cutoff_idx] = 0

    # filtro passa basso
    t = 10
    filt_fft[t:filt_fft.size//2] = 0
    filt_fft[filt_fft.size//2 +t:] = 0

    # Segnale ricostruito
    cv = np.real(fftpack.ifft(filt_fft))
    cv = (cv - np.min(cv))
    cv = cv / np.max(cv)

    i_max = massimi(cv,window=window_max)
    i_max = [i for i in i_max if cv[i] > .2]
    
    return np.array(i_max)
In [51]:
n_plot = 5
fig, ax = plt.subplots(n_plot,1,figsize=(15,4*n_plot))

for i in range(n_plot):
    sig = autocorrelation(X_d[i])
    
    fft = fftpack.fft(sig)
    freq = fftpack.fftfreq(fft.size, d=1/8000)
    sig = np.abs(fft[:fft.size//2])
    
    mas = massimi_potenza_freq(sig,30)
    ax[i].plot(freq[:freq.size//2],sig)
    l = freq[freq.size//2] / sig.size
    ax[i].plot(l*mas,np.zeros(len(mas)),'o',markersize=10)
    ax[i].plot(spectral_centroid(freq[:freq.size//2],np.where(sig/np.max(sig) < .1, 0, sig)),0,'x',markersize=10)
C:\Users\mette\Anaconda3\lib\site-packages\numpy\lib\function_base.py:392: RuntimeWarning: Mean of empty slice.
  avg = a.mean(axis)
C:\Users\mette\Anaconda3\lib\site-packages\numpy\core\_methods.py:85: RuntimeWarning: invalid value encountered in double_scalars
  ret = ret.dtype.type(ret / rcount)

Spectral Rolloff

It is a measure of the shape of the signal. It represents the frequency below which a specified percentage of the total spectral energy, e.g. 85%, lies.

In pratica è una versione della banda al x% di energia.

In [52]:
def spectral_rolloff(sig, threshold=.9):
    """Return the spectral rolloff"""
    if threshold > 1:
        threshold = 1
        
    fft = fftpack.fft(sig)
    freq = fftpack.fftfreq(fft.size, d=1/8000)
    
    fft = fft[:fft.size//2]
    freq = freq[:freq.size//2]
    
    pot = np.abs(fft)**2
    pot = pot / np.sum(pot)
    s = 0
    i = 0
    while s < threshold:
        s += pot[i]
        i += 1
    return freq[i]
In [53]:
n_plot = 10
fig, ax = plt.subplots(n_plot,1,figsize=(15,4*n_plot))

for i in range(n_plot):
    sig = autocorrelation(X_d[i])
    
    fft = fftpack.fft(sig)
    freq = fftpack.fftfreq(fft.size, d=1/8000)
    fft = np.abs(fft[:fft.size//2])
    
    ax[i].plot(freq[:freq.size//2],fft)
    ax[i].plot(spectral_centroid(freq[:freq.size//2],np.where(fft/np.max(fft) < .1, 0, fft)),0,'x',markersize=10)
    ax[i].plot(spectral_rolloff(sig),0,'|',markersize=10)

Rise time

Il tempo che il segnale impiega a passare dal 10% al 90% della sua energia. E' interessante perchè i segnali più impulivi avranno un tempo di salita piccolo e viceversa

In [54]:
def rise_time(sig, T=1/8000):
    sig = sig**2
    sig = sig / np.sum(sig)
    lower = 0
    upper = sig.size-1
    s = 0
    for i in range(sig.size):
        lower = i if s < .1 else lower
        upper = i if 1-s > .1 else upper
        s += sig[i]
    return (upper-lower)*T #upper,lower
In [56]:
n_plot = 10
fig, ax = plt.subplots(n_plot,1,figsize=(15,4*n_plot))

for i in range(n_plot):
    sig = X_d[i]
    u,l = rise_time(sig)
    ax[i].plot(sig)
    ax[i].plot(l,0,'|',markersize=100)
    ax[i].plot(u,0,'|',markersize=100)
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-56-71ea3387acf2> in <module>
      4 for i in range(n_plot):
      5     sig = X_d[i]
----> 6     u,l = rise_time(sig)
      7     ax[i].plot(sig)
      8     ax[i].plot(l,0,'|',markersize=100)

TypeError: cannot unpack non-iterable float object

centro di energia

Allo stesso modo posso calcolare il punto in cui l'energia raggiunge il 50%. Il tempo che un segnale impiega a raggiungere il 50% di energia è una casatteristica assoluta nel segnale (tempo dall'inizio). Il rise_time() invece è una carattereistica indipendente dalla posizione nel segnale.

In [55]:
def centro_energia(sig):
    sig = sig**2
    sig = sig / np.sum(sig)
    s = 0
    i = 0
    while s < .5 and i < sig.size:
        s += sig[i]
        i += 1
    return i
In [56]:
n_plot = 10
fig, ax = plt.subplots(n_plot,1,figsize=(15,4*n_plot))

for i in range(n_plot):
    sig = X_d[i]
    u = centro_energia(sig)
    ax[i].plot(sig)
    ax[i].plot(u,0,'|',markersize=100)

Percentuale di energia

Estensione del concetto di centro ti energia: il centro è il 50% dell'energia

In [57]:
def perc_energia(sig, p):
    """Ritorna l'indice del segnale al quale la percentuale di energia vale 'p', in (0,1)"""
    sig = sig**2
    sig = sig / np.sum(sig)
    s = 0
    i = 0
    while s < p and i < sig.size:
        s += sig[i]
        i += 1
    return i

Minima distanza tra due gobbe

In [58]:
def min_dist(sig):
    if sig.size < 2:
        return 0
    
    sig = np.sort(sig)
    return min([np.abs(sig[i]-sig[i-1]) for i in range(1,sig.size)])

Framing del segnale

Una tecnica molto usata è quella di 'spezzare' il segnale e analizzare ciascun pezzo. Farò una prova calcolando la percentuale di energia per ogni frame.

In [59]:
def power(sig):
    """Potenza del segnale"""
    return np.abs(sig)**2
In [60]:
def ener_dist(sig, slots=10):
    """Distribuzione di energia"""
    power = np.abs(sig)**2
    dist = np.zeros(slots)
    slot_length = sig.size//(slots-1)
    
    for i in range(0,sig.size,slot_length):
        s = np.sum(power[i:i+slot_length])
        dist[i//slot_length] = s
        
    # Avanzi
    dist[-1] += np.sum(power[-(sig.size % slots)])
    
    return dist / np.sum(power)
In [61]:
fig, ax = plt.subplots(figsize=(14,4))
ax.plot(ener_dist(X_d[1]));

Features extraction

In [62]:
import scipy.stats as stats

def extract_features(sig):
    f = dict()
    t = 1/8000
    
    # TIME 
    
    # Signal length
    f['length'] = sig.size * t
    
    # Amplitude: mean, stddev, min, max
    f['avg_amp'] = np.average(sig)
    f['stddev_amp'] = np.std(sig)
    f['max_amp'] = np.max(sig)
    f['min_amp'] = np.min(sig)
    
    # abs(Amplitude): mean, stddev, min, max
    f['avg_abs_amp'] = np.average(np.abs(sig))
    f['stddev_abs_amp'] = np.std(np.abs(sig))
    f['max_abs_amp'] = np.max(np.abs(sig))
    f['min_abs_amp'] = np.min(np.abs(sig))
    
    # Signal kurtosis
    f['sig_kurtosis'] = stats.kurtosis(sig)
    
    # Signal skewness
    f['sig_skew'] = stats.skew(sig)
    
    # Linear regression
    slope, intercept, r_value, p_value, std_err = stats.linregress(t*np.array(range(sig.size)),sig)
    f['sig_linreg_slope'] = slope
    f['sig_linreg_intercept'] = intercept
    f['sig_linreg_r_value'] = r_value
    f['sig_linreg_p_value'] = p_value
    f['sig_linreg_std_err'] = std_err
    
    # 'Gobbe'
    massimi = massimi_potenza(sig) # indici di sig corrispondenti ai massimi
    f['n_gob'] = massimi.size
    f['min_dist_gob'] = min_dist(massimi) * t
    f['best_gob'] = massimi[0] * t # è la gobba più alta
    
    # Zero crossing rate
    f['zero_crossing_rate'] = zero_crossing_rate(sig)
    
    # Rise time
    f['rise_time'] = rise_time(sig)
    
    # Energy center
    f['ener_center'] = centro_energia(sig) * t
    
    # Percentuale di energia
    p = range(5,96,10) # percentuali
    for i in p:
        f['ener_' + str(i) + '_perc'] = perc_energia(sig,i/100) * t
    
    #Energy distribution (%): riassume la forma del segnale
    slots = 10
    ener_d = ener_dist(sig, slots)
    for i,en in enumerate(ener_d):
        f['ener_dist_' + str(i+1)] = en
    
    # Numero di slot con % energia superiore alla media
    f['top_slots'] = len([i for i in ener_d if i > np.average(ener_d)]) / slots
    
    
    # FREQUENCY
    
    # FFT
    fft = fftpack.fft(sig)
    freq = fftpack.fftfreq(fft.size, d=t)
    power = np.abs(fft[:fft.size//2])
    
    # Power base statistics
    f['f_avg_power'] = np.average(power)
    f['f_stddev_power'] = np.std(power)
    f['f_min_power'] = np.min(power)
    f['f_max_power'] = np.max(power)
    
    # Power kurtosis
    f['f_power_kurtosis'] = stats.kurtosis(power)
    
    # Signal skewness
    f['f_power_skew'] = stats.skew(power)
    
    # Linear regression
    slope, intercept, r_value, p_value, std_err = stats.linregress(freq[:freq.size//2],power)
    f['f_power_linreg_slope'] = slope
    f['f_power_linreg_intercept'] = intercept
    f['f_power_linreg_r_value'] = r_value
    f['f_power_linreg_p_value'] = p_value
    f['f_power_linreg_std_err'] = std_err
    
    # Spectral centroid & Spectrum stddev
    sp_centroid, sp_std = spectral_info(sig)
    f['spec_centroid'] = sp_centroid
    f['stddev_spec'] = sp_std
    
    # Spectral rolloff (freq. sotto la quale ho il 90% di energia)
    f['spec_rolloff'] = spectral_rolloff(sig)
    
    # Spectral rolloff a tante frequenze
    p = range(5,96,10) # percentuali
    for i in p:
        f['spec_rolloff_' + str(i) + '_perc'] = spectral_rolloff(sig,i/100)
    
    
    
    # PERCENTILES
    
    p = range(5,96,10)
    
    # Time
    
    # Percentili del segnale
    for i in p:
        f['time_' + str(i) + '_perc'] = np.percentile(sig,i)
        
    # Percentili della derivata del segnale
    for i in p:
        f['time_der_' + str(i) + '_perc'] = np.percentile(der(sig),i)

    # Frequency (no filtraggio o autocorr)
    for i in p:
        f['freq_' + str(i) + '_perc'] = np.percentile(power,i)
    
    # Frequency: percentili della derivata della potenza
    for i in p:
        f['freq_der_' + str(i) + '_perc'] = np.percentile(der(power),i)
        
    return f
In [63]:
extract_features(X_d[0])
Out[63]:
{'length': 0.1895,
 'avg_amp': -0.0001862261481396496,
 'stddev_amp': 0.3262283575398377,
 'max_amp': 1.0,
 'min_amp': -0.7920772946859903,
 'avg_abs_amp': 0.2441028386422444,
 'stddev_abs_amp': 0.21642268852899135,
 'max_abs_amp': 1.0,
 'min_abs_amp': 9.66183574879227e-05,
 'sig_kurtosis': 0.5266601217598406,
 'sig_skew': 0.4722378128229826,
 'sig_linreg_slope': 0.007641790098479099,
 'sig_linreg_intercept': -0.0009098081480893892,
 'sig_linreg_r_value': 0.0012814211615631524,
 'sig_linreg_p_value': 0.9602402633801186,
 'sig_linreg_std_err': 0.15326391626329075,
 'n_gob': 1,
 'min_dist_gob': 0.0,
 'best_gob': 0.0625,
 'zero_crossing_rate': 201,
 'rise_time': 0.08737500000000001,
 'ener_center': 0.064625,
 'ener_5_perc': 0.013000000000000001,
 'ener_15_perc': 0.026375,
 'ener_25_perc': 0.039125,
 'ener_35_perc': 0.050875000000000004,
 'ener_45_perc': 0.060125,
 'ener_55_perc': 0.069,
 'ener_65_perc': 0.07875,
 'ener_75_perc': 0.0885,
 'ener_85_perc': 0.099875,
 'ener_95_perc': 0.128875,
 'ener_dist_1': 0.11573055924123113,
 'ener_dist_2': 0.1573694984462106,
 'ener_dist_3': 0.2177713334824703,
 'ener_dist_4': 0.2151211889955124,
 'ener_dist_5': 0.1859986014326844,
 'ener_dist_6': 0.053830810255449175,
 'ener_dist_7': 0.028617787124102033,
 'ener_dist_8': 0.01689520972075151,
 'ener_dist_9': 0.008632566852868477,
 'ener_dist_10': 5.0654126496654195e-05,
 'top_slots': 0.5,
 'f_avg_power': 4.136617718495262,
 'f_stddev_power': 12.009525730775067,
 'f_min_power': 0.0027700698644093687,
 'f_max_power': 138.0644944783284,
 'f_power_kurtosis': 43.60969235154188,
 'f_power_skew': 5.762864993529072,
 'f_power_linreg_slope': -0.004187071650565248,
 'f_power_linreg_intercept': 12.49971333717044,
 'f_power_linreg_r_value': -0.4025812334431007,
 'f_power_linreg_p_value': 6.713839247425705e-31,
 'f_power_linreg_std_err': 0.00034625767706640103,
 'spec_centroid': 527.0715538297464,
 'stddev_spec': 557.2339078239443,
 'spec_rolloff': 865.4353562005276,
 'spec_rolloff_5_perc': 374.6701846965699,
 'spec_rolloff_15_perc': 416.88654353562004,
 'spec_rolloff_25_perc': 432.7176781002638,
 'spec_rolloff_35_perc': 437.9947229551451,
 'spec_rolloff_45_perc': 437.9947229551451,
 'spec_rolloff_55_perc': 443.27176781002635,
 'spec_rolloff_65_perc': 506.59630606860156,
 'spec_rolloff_75_perc': 796.8337730870712,
 'spec_rolloff_85_perc': 860.1583113456464,
 'spec_rolloff_95_perc': 870.7124010554089,
 'time_5_perc': -0.5363043478260869,
 'time_15_perc': -0.2858454106280193,
 'time_25_perc': -0.19485507246376813,
 'time_35_perc': -0.12135265700483092,
 'time_45_perc': -0.06111111111111111,
 'time_55_perc': -0.002608695652173869,
 'time_65_perc': 0.06879227053140097,
 'time_75_perc': 0.14893719806763284,
 'time_85_perc': 0.3058937198067633,
 'time_95_perc': 0.6501932367149759,
 'time_der_5_perc': -2427.2850241545893,
 'time_der_15_perc': -868.7149758454107,
 'time_der_25_perc': -461.44927536231876,
 'time_der_35_perc': -255.14975845410606,
 'time_der_45_perc': -67.0144927536231,
 'time_der_55_perc': 186.0483091787439,
 'time_der_65_perc': 401.159420289855,
 'time_der_75_perc': 624.9275362318841,
 'time_der_85_perc': 1090.4734299516906,
 'time_der_95_perc': 1861.9516908212554,
 'freq_5_perc': 0.047462454208777614,
 'freq_15_perc': 0.07827625997485124,
 'freq_25_perc': 0.12381739867194586,
 'freq_35_perc': 0.17453474578622746,
 'freq_45_perc': 0.2305991603376309,
 'freq_55_perc': 0.3083385284057002,
 'freq_65_perc': 0.4884881587814418,
 'freq_75_perc': 1.4168702992147209,
 'freq_85_perc': 6.754951464651299,
 'freq_95_perc': 23.100250091381763,
 'freq_der_5_perc': -21055.422677187704,
 'freq_der_15_perc': -2055.4939795599116,
 'freq_der_25_perc': -937.1624124614004,
 'freq_der_35_perc': -405.10243473313244,
 'freq_der_45_perc': -75.7734099647774,
 'freq_der_55_perc': 143.74728258642472,
 'freq_der_65_perc': 527.5100249508557,
 'freq_der_75_perc': 1050.8606715503422,
 'freq_der_85_perc': 2806.6428026537837,
 'freq_der_95_perc': 24708.13442534132}
In [64]:
import pandas as pd

X_df = pd.DataFrame([extract_features(X_d[i]) for i in range(X_d.size)])
X_df
Out[64]:
length avg_amp stddev_amp max_amp min_amp avg_abs_amp stddev_abs_amp max_abs_amp min_abs_amp sig_kurtosis ... freq_der_5_perc freq_der_15_perc freq_der_25_perc freq_der_35_perc freq_der_45_perc freq_der_55_perc freq_der_65_perc freq_der_75_perc freq_der_85_perc freq_der_95_perc
0 0.189500 -0.000186 0.326228 1.000000 -0.792077 0.244103 0.216423 1.0 0.000097 0.526660 ... -21055.422677 -2055.493980 -937.162412 -405.102435 -75.773410 143.747283 527.510025 1050.860672 2806.642803 24708.134425
1 0.282250 -0.000373 0.248947 0.737621 -1.000000 0.184893 0.166700 1.0 0.000000 0.803195 ... -43223.409852 -14142.569819 -5051.241636 -1756.667572 -402.046488 461.786717 2160.363739 5900.370529 13543.759659 46541.542152
2 0.290000 -0.030402 0.196933 0.666667 -1.000000 0.133822 0.147644 1.0 0.000000 3.640200 ... -18262.715930 -7469.825338 -4849.868531 -2487.645856 -891.107893 799.958063 2552.701267 4605.333361 7676.336632 19141.230634
3 0.147375 -0.000112 0.394771 1.000000 -0.942802 0.313718 0.239635 1.0 0.000048 -0.394984 ... -21007.261559 -4778.537229 -2575.101697 -1283.597310 -436.382260 289.591503 1188.906164 2461.395639 5217.667321 23308.016821
4 0.296750 -0.000332 0.246363 0.854836 -1.000000 0.181772 0.166294 1.0 0.000079 1.400172 ... -30448.844571 -9735.085301 -5291.529875 -2471.887612 -813.425266 617.312972 2653.925300 4837.829710 9877.176970 29596.582583
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1474 0.212000 -0.043548 0.301636 0.904762 -1.000000 0.222793 0.207952 1.0 0.000000 0.796990 ... -16320.454528 -7449.342171 -4710.088166 -2537.438929 -901.334635 917.746060 2439.563057 4387.331071 7036.760159 15037.273476
1475 0.293875 -0.081938 0.295101 0.818182 -1.000000 0.221685 0.211316 1.0 0.000000 0.917214 ... -39827.581208 -21971.784726 -13756.301054 -7851.838722 -2600.875102 2515.583639 8021.384329 14569.917146 22095.967732 39172.293680
1476 0.111500 -0.029073 0.293233 0.900000 -1.000000 0.219357 0.196757 1.0 0.000000 0.653792 ... -15284.664091 -5832.066651 -3529.188778 -1627.169020 -634.972260 741.731305 1974.364759 3661.307942 7265.531369 20628.912592
1477 0.168000 -0.000263 0.253556 1.000000 -0.604720 0.180689 0.177882 1.0 0.000000 1.207786 ... -21960.712713 -7575.734178 -3558.639528 -1873.496489 -546.406182 541.849172 1844.050730 3727.056299 8286.865934 23205.194394
1478 0.191125 -0.000013 0.248714 0.629208 -1.000000 0.177694 0.174021 1.0 0.000000 1.549279 ... -29237.907283 -12713.644275 -5842.696542 -2823.553985 -785.648045 873.799674 2200.174142 5666.737911 11607.555248 28459.373033

1479 rows × 107 columns

In [65]:
X_df.describe()
Out[65]:
length avg_amp stddev_amp max_amp min_amp avg_abs_amp stddev_abs_amp max_abs_amp min_abs_amp sig_kurtosis ... freq_der_5_perc freq_der_15_perc freq_der_25_perc freq_der_35_perc freq_der_45_perc freq_der_55_perc freq_der_65_perc freq_der_75_perc freq_der_85_perc freq_der_95_perc
count 1479.000000 1479.000000 1479.000000 1479.000000 1479.000000 1479.000000 1479.000000 1479.000000 1479.000000 1479.000000 ... 1479.000000 1479.000000 1479.000000 1479.000000 1479.000000 1479.000000 1479.000000 1479.000000 1479.000000 1479.000000
mean 0.279859 -0.008695 0.240222 0.815043 -0.951274 0.173580 0.165926 0.999950 0.000061 2.572152 ... -31749.524235 -10091.017599 -5098.919807 -2504.686502 -740.142495 711.033977 2461.458223 5040.878363 9975.775114 31437.213391
std 0.105590 0.016900 0.058821 0.169961 0.094981 0.054193 0.030486 0.001907 0.000187 2.800285 ... 10100.377971 3767.215203 2094.240422 1110.733655 386.982664 370.629145 1109.820565 2093.157212 3770.536907 10225.608810
min 0.066750 -0.147195 0.102033 0.313577 -1.000092 0.050684 0.077023 0.926667 0.000000 -0.824186 ... -74187.239940 -31540.935625 -17932.476658 -9890.842646 -3955.985055 0.588956 192.148764 507.939396 1559.323714 8003.477907
25% 0.199875 -0.013110 0.192443 0.682699 -1.000000 0.129886 0.142336 1.000000 0.000000 0.559730 ... -37633.150126 -12120.381442 -6220.393327 -3125.642331 -925.018925 443.665313 1711.067822 3648.359979 7568.791921 24248.912070
50% 0.270875 -0.000258 0.235955 0.827586 -1.000000 0.169506 0.165315 1.000000 0.000000 1.628353 ... -30204.298753 -9610.590471 -4931.277544 -2419.718407 -684.157223 661.275776 2378.322479 4845.850223 9461.160623 30045.033385
75% 0.354812 0.000093 0.283074 1.000000 -0.948879 0.212246 0.188109 1.000000 0.000000 3.925444 ... -24528.490163 -7626.004068 -3680.319427 -1745.971714 -468.452129 920.378173 3054.484666 6222.666425 11954.500940 36877.871866
max 0.561375 0.003842 0.430772 1.000000 -0.485512 0.360998 0.256188 1.000092 0.001883 21.613519 ... -7383.099005 -1650.820822 -530.356159 -187.762566 -40.441810 2699.125389 10056.514404 18935.254253 31675.024781 74256.926324

8 rows × 107 columns

In [66]:
X_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1479 entries, 0 to 1478
Columns: 107 entries, length to freq_der_95_perc
dtypes: float64(105), int64(2)
memory usage: 1.2 MB

Features correlation

In [67]:
import seaborn as sns; sns.set()

def plot_heatmap(X_df):
    #compute the correlation matrix
    correlation_matrix = X_df.corr()
    feature_names = X_df.columns

    # Generate a mask for the upper triangle
    mask = np.zeros_like(correlation_matrix, dtype=np.bool)
    mask[np.triu_indices_from(mask)] = True

    # Set up the matplotlib figure
    f, ax = plt.subplots(figsize=(15, 15))

    # Generate a custom diverging colormap
    cmap = sns.diverging_palette(220, 10, as_cmap=True)

    # Draw the heatmap with the mask and correct aspect ratio
    ax = sns.heatmap(correlation_matrix, mask=mask, cmap=cmap, vmax=1, vmin = -1, center=0,
                square=True, linewidths=.5);

    #build the heatmap
    #ax = sns.heatmap(
    #    correlation_matrix, 
    #    vmin=-1, vmax=1, center=0,
    #    cmap=sns.diverging_palette(20, 220, n=200),
    #    square=True
    #)

    ax.set_xticklabels(
        feature_names,
        rotation=45,
        horizontalalignment='right'
    );
In [68]:
plot_heatmap(X_df)

Riduzione features fortemente correlate

In [69]:
print('All the strongly correlated features:\n')

threshold = .85

correlation_matrix = X_df.corr()
feature_names = X_df.columns

# Lista di features correlate tra loro
correlated = []

for i in range(len(correlation_matrix)):
    for j in range(0,i):
        if correlation_matrix.iloc[i,j] > threshold or correlation_matrix.iloc[i,j] < -threshold:
            print(f'({feature_names[i]}, {feature_names[j]}) = {correlation_matrix.iloc[i,j]:.3f}')
            
            # Connessione
            correlated.append([feature_names[i], feature_names[j]])
All the strongly correlated features:

(avg_abs_amp, stddev_amp) = 0.988
(stddev_abs_amp, stddev_amp) = 0.955
(stddev_abs_amp, avg_abs_amp) = 0.901
(sig_skew, max_amp) = 0.856
(sig_linreg_intercept, avg_amp) = 0.995
(rise_time, length) = 0.859
(ener_center, best_gob) = 0.903
(ener_15_perc, best_gob) = 0.861
(ener_15_perc, ener_center) = 0.908
(ener_15_perc, ener_5_perc) = 0.934
(ener_25_perc, best_gob) = 0.901
(ener_25_perc, ener_center) = 0.953
(ener_25_perc, ener_5_perc) = 0.887
(ener_25_perc, ener_15_perc) = 0.981
(ener_35_perc, best_gob) = 0.910
(ener_35_perc, ener_center) = 0.979
(ener_35_perc, ener_5_perc) = 0.850
(ener_35_perc, ener_15_perc) = 0.952
(ener_35_perc, ener_25_perc) = 0.987
(ener_45_perc, best_gob) = 0.911
(ener_45_perc, ener_center) = 0.998
(ener_45_perc, ener_15_perc) = 0.922
(ener_45_perc, ener_25_perc) = 0.965
(ener_45_perc, ener_35_perc) = 0.988
(ener_55_perc, best_gob) = 0.892
(ener_55_perc, ener_center) = 0.998
(ener_55_perc, ener_15_perc) = 0.896
(ener_55_perc, ener_25_perc) = 0.943
(ener_55_perc, ener_35_perc) = 0.972
(ener_55_perc, ener_45_perc) = 0.993
(ener_65_perc, best_gob) = 0.860
(ener_65_perc, ener_center) = 0.986
(ener_65_perc, ener_15_perc) = 0.868
(ener_65_perc, ener_25_perc) = 0.919
(ener_65_perc, ener_35_perc) = 0.952
(ener_65_perc, ener_45_perc) = 0.977
(ener_65_perc, ener_55_perc) = 0.992
(ener_75_perc, length) = 0.856
(ener_75_perc, rise_time) = 0.865
(ener_75_perc, ener_center) = 0.960
(ener_75_perc, ener_25_perc) = 0.884
(ener_75_perc, ener_35_perc) = 0.921
(ener_75_perc, ener_45_perc) = 0.949
(ener_75_perc, ener_55_perc) = 0.969
(ener_75_perc, ener_65_perc) = 0.988
(ener_85_perc, length) = 0.889
(ener_85_perc, rise_time) = 0.922
(ener_85_perc, ener_center) = 0.922
(ener_85_perc, ener_35_perc) = 0.881
(ener_85_perc, ener_45_perc) = 0.909
(ener_85_perc, ener_55_perc) = 0.934
(ener_85_perc, ener_65_perc) = 0.957
(ener_85_perc, ener_75_perc) = 0.981
(ener_95_perc, length) = 0.942
(ener_95_perc, rise_time) = 0.933
(ener_95_perc, ener_center) = 0.877
(ener_95_perc, ener_45_perc) = 0.865
(ener_95_perc, ener_55_perc) = 0.889
(ener_95_perc, ener_65_perc) = 0.914
(ener_95_perc, ener_75_perc) = 0.941
(ener_95_perc, ener_85_perc) = 0.973
(f_max_power, f_stddev_power) = 0.871
(f_power_skew, f_power_kurtosis) = 0.960
(f_power_linreg_intercept, f_power_linreg_slope) = -0.964
(f_power_linreg_std_err, stddev_amp) = 0.974
(f_power_linreg_std_err, avg_abs_amp) = 0.969
(f_power_linreg_std_err, stddev_abs_amp) = 0.926
(stddev_spec, spec_centroid) = 0.960
(spec_rolloff_25_perc, spec_rolloff_15_perc) = 0.881
(spec_rolloff_35_perc, spec_centroid) = 0.875
(spec_rolloff_35_perc, spec_rolloff_25_perc) = 0.901
(spec_rolloff_45_perc, spec_centroid) = 0.890
(spec_rolloff_45_perc, spec_rolloff_35_perc) = 0.933
(spec_rolloff_55_perc, spec_centroid) = 0.859
(spec_rolloff_55_perc, spec_rolloff_45_perc) = 0.889
(spec_rolloff_85_perc, spec_rolloff) = 0.866
(time_5_perc, stddev_amp) = -0.929
(time_5_perc, avg_abs_amp) = -0.910
(time_5_perc, stddev_abs_amp) = -0.920
(time_5_perc, f_power_linreg_std_err) = -0.911
(time_15_perc, stddev_amp) = -0.949
(time_15_perc, avg_abs_amp) = -0.969
(time_15_perc, stddev_abs_amp) = -0.865
(time_15_perc, f_power_linreg_std_err) = -0.943
(time_15_perc, time_5_perc) = 0.908
(time_25_perc, stddev_amp) = -0.898
(time_25_perc, avg_abs_amp) = -0.936
(time_25_perc, f_power_linreg_std_err) = -0.905
(time_25_perc, time_15_perc) = 0.948
(time_35_perc, time_25_perc) = 0.922
(time_45_perc, time_35_perc) = 0.855
(time_75_perc, avg_abs_amp) = 0.868
(time_75_perc, time_65_perc) = 0.932
(time_85_perc, stddev_amp) = 0.934
(time_85_perc, avg_abs_amp) = 0.955
(time_85_perc, f_power_linreg_std_err) = 0.903
(time_85_perc, time_15_perc) = -0.893
(time_85_perc, time_25_perc) = -0.856
(time_85_perc, time_75_perc) = 0.933
(time_95_perc, stddev_amp) = 0.944
(time_95_perc, avg_abs_amp) = 0.924
(time_95_perc, stddev_abs_amp) = 0.908
(time_95_perc, f_power_linreg_std_err) = 0.915
(time_95_perc, time_15_perc) = -0.866
(time_95_perc, time_85_perc) = 0.898
(time_der_25_perc, time_der_15_perc) = 0.884
(time_der_65_perc, time_der_35_perc) = -0.933
(time_der_75_perc, time_der_15_perc) = -0.870
(time_der_75_perc, time_der_25_perc) = -0.931
(time_der_85_perc, time_der_15_perc) = -0.964
(time_der_85_perc, time_der_25_perc) = -0.875
(time_der_85_perc, time_der_75_perc) = 0.891
(time_der_95_perc, time_der_5_perc) = -0.958
(freq_15_perc, freq_5_perc) = 0.937
(freq_25_perc, freq_5_perc) = 0.860
(freq_25_perc, freq_15_perc) = 0.973
(freq_35_perc, freq_15_perc) = 0.938
(freq_35_perc, freq_25_perc) = 0.989
(freq_45_perc, freq_15_perc) = 0.895
(freq_45_perc, freq_25_perc) = 0.962
(freq_45_perc, freq_35_perc) = 0.988
(freq_55_perc, freq_25_perc) = 0.913
(freq_55_perc, freq_35_perc) = 0.953
(freq_55_perc, freq_45_perc) = 0.985
(freq_65_perc, freq_35_perc) = 0.860
(freq_65_perc, freq_45_perc) = 0.911
(freq_65_perc, freq_55_perc) = 0.960
(freq_75_perc, freq_65_perc) = 0.877
(freq_der_15_perc, freq_45_perc) = -0.853
(freq_der_15_perc, freq_55_perc) = -0.876
(freq_der_15_perc, freq_65_perc) = -0.867
(freq_der_25_perc, freq_15_perc) = -0.878
(freq_der_25_perc, freq_25_perc) = -0.894
(freq_der_25_perc, freq_35_perc) = -0.894
(freq_der_25_perc, freq_45_perc) = -0.888
(freq_der_25_perc, freq_55_perc) = -0.869
(freq_der_25_perc, freq_der_15_perc) = 0.936
(freq_der_35_perc, freq_5_perc) = -0.881
(freq_der_35_perc, freq_15_perc) = -0.908
(freq_der_35_perc, freq_25_perc) = -0.886
(freq_der_35_perc, freq_35_perc) = -0.860
(freq_der_35_perc, freq_der_25_perc) = 0.960
(freq_der_45_perc, freq_der_35_perc) = 0.908
(freq_der_65_perc, freq_5_perc) = 0.872
(freq_der_65_perc, freq_15_perc) = 0.905
(freq_der_65_perc, freq_25_perc) = 0.883
(freq_der_65_perc, freq_35_perc) = 0.859
(freq_der_65_perc, freq_der_25_perc) = -0.943
(freq_der_65_perc, freq_der_35_perc) = -0.948
(freq_der_65_perc, freq_der_55_perc) = 0.915
(freq_der_75_perc, freq_15_perc) = 0.875
(freq_der_75_perc, freq_25_perc) = 0.893
(freq_der_75_perc, freq_35_perc) = 0.895
(freq_der_75_perc, freq_45_perc) = 0.891
(freq_der_75_perc, freq_55_perc) = 0.873
(freq_der_75_perc, freq_der_15_perc) = -0.932
(freq_der_75_perc, freq_der_25_perc) = -0.977
(freq_der_75_perc, freq_der_35_perc) = -0.937
(freq_der_75_perc, freq_der_65_perc) = 0.959
(freq_der_85_perc, freq_45_perc) = 0.853
(freq_der_85_perc, freq_55_perc) = 0.875
(freq_der_85_perc, freq_65_perc) = 0.864
(freq_der_85_perc, freq_der_15_perc) = -0.980
(freq_der_85_perc, freq_der_25_perc) = -0.926
(freq_der_85_perc, freq_der_75_perc) = 0.940
(freq_der_95_perc, freq_der_5_perc) = -0.948
In [70]:
import numpy as np

def componenti_connesse(connessioni):
    """
    Restituisce le componenti connesse a partire da una lista di connessioni
    connessioni:lista di liste
    """
    elem = []
    for c in connessioni:
        elem.append(c[0])
        elem.append(c[1])
        
    # Vettore di elementi: possono essere qualsiasi oggetto
    elem = list(set(elem))
    
    # Vettore delle connessioni
    v = np.array([i for i in range(len(elem))])
    
    for c in connessioni:
        # Hashing
        i = elem.index(c[0])
        j = elem.index(c[1])
        
        # Verifica connessione
        if v[i] != v[j]:
            
            # Creazione connessione
            b = v[j]
            a = v[i]
            for k in range(v.size):
                if v[k] == a:
                    v[k] = b
   
    # Estrazione componenti connesse
    cc = list(set(v))
    elem = np.array(elem)
    cc_el = []
    for c in cc:
        cc_el.append(elem[v == c].tolist())
    
    return cc_el
In [71]:
# Componenti connesse della correlazione tra features
# Non sto differenziando correlazione positiva e negativa. lo devo fare?
comp_conn = componenti_connesse(correlated)
comp_conn
Out[71]:
[['time_der_75_perc',
  'time_der_25_perc',
  'time_der_15_perc',
  'time_der_85_perc'],
 ['freq_45_perc',
  'freq_der_85_perc',
  'freq_55_perc',
  'freq_der_25_perc',
  'freq_75_perc',
  'freq_der_45_perc',
  'freq_der_15_perc',
  'freq_15_perc',
  'freq_35_perc',
  'freq_der_65_perc',
  'freq_5_perc',
  'freq_25_perc',
  'freq_65_perc',
  'freq_der_35_perc',
  'freq_der_75_perc',
  'freq_der_55_perc'],
 ['ener_25_perc',
  'ener_95_perc',
  'ener_35_perc',
  'ener_45_perc',
  'ener_65_perc',
  'ener_center',
  'rise_time',
  'ener_75_perc',
  'ener_85_perc',
  'best_gob',
  'length',
  'ener_55_perc',
  'ener_15_perc',
  'ener_5_perc'],
 ['sig_skew', 'max_amp'],
 ['f_max_power', 'f_stddev_power'],
 ['sig_linreg_intercept', 'avg_amp'],
 ['time_der_35_perc', 'time_der_65_perc'],
 ['freq_der_95_perc', 'freq_der_5_perc'],
 ['stddev_spec',
  'spec_rolloff_15_perc',
  'spec_rolloff_35_perc',
  'spec_centroid',
  'spec_rolloff_55_perc',
  'spec_rolloff_45_perc',
  'spec_rolloff_25_perc'],
 ['time_85_perc',
  'f_power_linreg_std_err',
  'time_75_perc',
  'time_65_perc',
  'time_5_perc',
  'time_35_perc',
  'stddev_abs_amp',
  'time_95_perc',
  'time_15_perc',
  'avg_abs_amp',
  'time_45_perc',
  'time_25_perc',
  'stddev_amp'],
 ['f_power_linreg_intercept', 'f_power_linreg_slope'],
 ['time_der_5_perc', 'time_der_95_perc'],
 ['f_power_skew', 'f_power_kurtosis'],
 ['spec_rolloff_85_perc', 'spec_rolloff']]

Per ogni componente connessa identifico un rappresentante: il più correlato all'interno della componente connessa e il meno correlato all'esterno (con le altre)

In [72]:
def rapp_feature(comp_conn, correlation_matrix):
    corr = correlation_matrix.copy()

    #  Rimuovo la correlazione di un indice con sè stesso
    for c in corr.columns:
        corr.loc[c,c] = 0

    # Faccio il modulo della correlazione: positiva e negativa non si devono cancellare a vicenda
    corr = np.abs(corr)

    # Correlazione totale di ciscuna feature con le altre
    somma_corr = corr.sum()

    rappr_features = []
    # Per ciasscuna comp_conn
    for cc in comp_conn:
        # Lo score viene calcolato come: correlazione_interna(comp conn)/correlazione_totale
        f, f_score = max([(feat, f_corr/somma_corr[feat]) for feat,f_corr in corr.loc[cc,cc].sum().items()],\
                         key=lambda x: x[1])

        rappr_features.append(f)
    
    return rappr_features

Funzione wrapper di tutto

In [73]:
def riduzione_correlate(X,threshold=.85):
    """Riceve un DataFrame X e ritorna X ridotto"""
    
    correlation_matrix = X.corr()
    feature_names = X.columns
    
    # Lista di features correlate tra loro
    correlated = []

    for i in range(len(correlation_matrix)):
        for j in range(i):
            if correlation_matrix.iloc[i,j] > threshold or correlation_matrix.iloc[i,j] < -threshold:
                # Connessione
                correlated.append([feature_names[i], feature_names[j]])
    
    # Componenti connesse della correlazione tra features
    # Non sto differenziando correlazione positiva e negativa. lo devo fare? Non credo
    comp_conn = componenti_connesse(correlated)
    
    # Feature rappresentante per ogni componente connessa
    best_feat = rapp_feature(comp_conn, correlation_matrix)
    
    # Rimozione delle features: tutte quelle delle componenti connesse meno il rappresentante
    bad_feat = []
    for i,cc in enumerate(comp_conn):
            bad_feat.extend(list(filter(lambda x: x != best_feat[i], cc)))
    
    # Drop delle colonne
    X_new = X.drop(columns=bad_feat) 
    
    return X_new
In [139]:
X_old = X_df.copy()
X_df = riduzione_correlate(X_old)
plot_heatmap(X_df)

Spiegazione

Alcune features possono essere fortemente correlate tra loro, il chè può avere effetti negativi sulle performanecs (o comunque non le migliora).

Riduzione - pipeline:

  1. Calcolo della matrice di correlazione con Pandas
  2. Estrazione delle coppie di features con correlazione superiore a una certa soglia (default 85%).
  3. Estrazione delle componenti connesse: si considerano le coppie come degli archi di un grafo. In questo modo ricavo le features che si 'muovono assieme'.
  4. Per ogni componente connessa estraggo un rappresentante. Esso è la feature maggiormente correlata all'interno della componente connessa e meno correlata con le altre features (esterne). Per fare questo prima rimuovo la diagonale della matrice di correlazione (che è tutta a 1 ed è inutile), poi faccio il modulo della stessa. Non voglio che sommando le correlazioni quelle positive si cancellino on quelle negative.

    Infine, lo score della feature è così calcolato:

    \begin{equation} \lambda = \frac{\sum_{j \in CC} corr_{j}}{\sum_{i=1}^{N_{feat}} corr_{i}} \end{equation}

    Ossia, per ogni feature di una componente connessa, è il rapporto tra la somma delle correlazioni che essa ha con le features della stessa componente connessa e la somma delle correlazioni che ha con tutte le features.

  5. Ogni componenete connessa viene ridotta al suo rappresentante.

Extraction of values

In [180]:
X = X_df.values
X.shape
Out[180]:
(1479, 49)

Discretization

In [179]:
from sklearn.preprocessing import KBinsDiscretizer

n_ft = X.shape[1]
est = KBinsDiscretizer(n_bins=[2 for i in range(n_ft)], encode='ordinal').fit(X)

# X = est.transform(X)
C:\Users\mette\Anaconda3\lib\site-packages\sklearn\preprocessing\_discretization.py:193: UserWarning: Bins whose width are too small (i.e., <= 1e-8) in feature 4 are removed. Consider decreasing the number of bins.
  'decreasing the number of bins.' % jj)
C:\Users\mette\Anaconda3\lib\site-packages\sklearn\preprocessing\_discretization.py:193: UserWarning: Bins whose width are too small (i.e., <= 1e-8) in feature 10 are removed. Consider decreasing the number of bins.
  'decreasing the number of bins.' % jj)
C:\Users\mette\Anaconda3\lib\site-packages\sklearn\preprocessing\_discretization.py:193: UserWarning: Bins whose width are too small (i.e., <= 1e-8) in feature 11 are removed. Consider decreasing the number of bins.
  'decreasing the number of bins.' % jj)
C:\Users\mette\Anaconda3\lib\site-packages\sklearn\preprocessing\_discretization.py:193: UserWarning: Bins whose width are too small (i.e., <= 1e-8) in feature 31 are removed. Consider decreasing the number of bins.
  'decreasing the number of bins.' % jj)

Data normalization

Fortemente condigliata per le SVM

In [182]:
from sklearn import preprocessing

# Min max normalization
min_max_scaler = preprocessing.MinMaxScaler()
X_mm = min_max_scaler.fit_transform(X)

# z-score normalization
z_scaler = preprocessing.StandardScaler()
X_z = z_scaler.fit_transform(X)

# X_z Sembra dare performances migliori (soprattutto x le SVM e log.regress):
X = X_z

Osservazioni sul preproc

Sia la discretization che la normalization hanno pompato le performances di tutti i classificatori (tranne le rnd forest, che sono rimaste circa uguali). Il fatto è che usando discretization + normalization si ottengono più o meno gli stessi miglioramenti ottenuti usando anche solo una delle due. Conviene comunque provare a metterle in cascata.

Il motivo di questo è che molti algoritmi di classificazione (SVM, KNN, Logistic regression...) non sono scale independent e necessitano che tutte le features vengano rappresentate con la stessa scala. Questo risultato si ottiene normalizzando, o discretizzando (con perdita di informazione).

Le random forest invece sono scale independent (?) perchè hanno dato gli stessi risultati indipendentemente dalla normalizzazione.

"Support Vector Machine algorithms are not scale invariant, so it is highly recommended to scale your data. For example, scale each attribute on the input vector X to [0,1] or [-1,+1], or standardize it to have mean 0 and variance 1." [Fonte: https://scikit-learn.org/stable/modules/svm.html]

Classification

In [183]:
import sklearn
In [184]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(X, y_d, test_size=.2)

Random Forest

In [185]:
from sklearn.ensemble import RandomForestClassifier

# Hanno beneficiato (poco poco) della data discretization

rf = RandomForestClassifier(n_estimators=1000)
rf.fit(x_train,y_train)

# Performance assessment
from sklearn.metrics import classification_report

y_p = rf.predict(x_test)
print(classification_report(y_test, y_p))
              precision    recall  f1-score   support

           0       0.83      0.91      0.87        32
           1       0.74      0.88      0.81        26
           2       0.83      0.69      0.75        29
           3       0.74      0.83      0.78        35
           4       1.00      0.94      0.97        31
           5       1.00      1.00      1.00        26
           6       0.87      0.93      0.90        28
           7       0.91      0.88      0.89        24
           8       0.92      0.85      0.88        39
           9       0.87      0.77      0.82        26

    accuracy                           0.86       296
   macro avg       0.87      0.87      0.87       296
weighted avg       0.87      0.86      0.87       296

Features importance
In [186]:
rf.feature_importances_
sorted([(f, imp) for f,imp in zip(feature_names, rf.feature_importances_)], key=lambda x: -x[1])
Out[186]:
[('sig_kurtosis', 0.061633623458154416),
 ('ener_dist_6', 0.05865814911474786),
 ('ener_dist_5', 0.05264168658929731),
 ('f_min_power', 0.05156493658790358),
 ('ener_dist_4', 0.04925972919320833),
 ('ener_dist_3', 0.048520648535352325),
 ('sig_linreg_intercept', 0.04314536228543698),
 ('ener_dist_2', 0.03799367241039359),
 ('best_gob', 0.03593475816257535),
 ('min_dist_gob', 0.03374480121429734),
 ('ener_75_perc', 0.03330565619742548),
 ('sig_linreg_p_value', 0.03045861675272958),
 ('sig_linreg_r_value', 0.027807026607158888),
 ('ener_85_perc', 0.025891638917717934),
 ('f_max_power', 0.02328128903853513),
 ('avg_abs_amp', 0.023244360125636108),
 ('sig_linreg_std_err', 0.02179128090998255),
 ('ener_35_perc', 0.019876752588097367),
 ('ener_dist_10', 0.01762014605374807),
 ('f_stddev_power', 0.017366713293536836),
 ('ener_dist_1', 0.017182807078557363),
 ('ener_dist_8', 0.01692970747312912),
 ('ener_dist_9', 0.016524500038025206),
 ('ener_55_perc', 0.015576045429659959),
 ('f_power_skew', 0.015186150386581802),
 ('f_avg_power', 0.013618914679915974),
 ('ener_dist_7', 0.013549471093641916),
 ('avg_amp', 0.013393265756538842),
 ('f_power_kurtosis', 0.012945183290135444),
 ('zero_crossing_rate', 0.012843960557722934),
 ('ener_65_perc', 0.012349678139330208),
 ('top_slots', 0.011010533104871564),
 ('ener_5_perc', 0.010800026518576832),
 ('length', 0.010626483492516839),
 ('stddev_abs_amp', 0.010042294664267908),
 ('rise_time', 0.009763804896732198),
 ('n_gob', 0.009652802147100773),
 ('ener_center', 0.009341179772666451),
 ('ener_15_perc', 0.008688565507973008),
 ('ener_45_perc', 0.008635768447720585),
 ('max_abs_amp', 0.007536576885265229),
 ('min_abs_amp', 0.007227942006074446),
 ('sig_linreg_slope', 0.005393388426181962),
 ('stddev_amp', 0.005141275600967444),
 ('ener_25_perc', 0.005034507548269791),
 ('min_amp', 0.003917700583237321),
 ('sig_skew', 0.00207894573671772),
 ('ener_95_perc', 0.0012503119246481315),
 ('max_amp', 1.7360777038026018e-05)]

Cross validation

In [187]:
from sklearn.model_selection import GridSearchCV

params = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 20, 40],
    #'max_leaf_nodes': [None, 10, 15, 20],
    #'min_impurity_decrease': [0.0, 0.1, 0.01]
    'max_features': ['sqrt', 'log2', None]
}

clf = RandomForestClassifier(n_estimators=100)

# Adesso la divisione tra training set e validation set è ciclica e
# la rotazione e la valutazione della miglior configurazione viene
# gestita dalla classe GridSearchCV.
gridsearch = GridSearchCV(clf, params, scoring='accuracy', cv=5, iid=False)

# Il fit viene sempre fatto su {validation + training} set!!
gridsearch.fit(x_train,y_train)

# Best accuracy sul validation set
#print(gridsearch.best_score_)

# Test delle performances
y_p = gridsearch.best_estimator_.predict(x_test)
#acc = accuracy_score(y_test, y_p)
print(classification_report(y_test, y_p))
              precision    recall  f1-score   support

           0       0.81      0.94      0.87        32
           1       0.77      0.88      0.82        26
           2       0.73      0.55      0.63        29
           3       0.72      0.74      0.73        35
           4       0.91      0.94      0.92        31
           5       1.00      1.00      1.00        26
           6       0.84      0.93      0.88        28
           7       0.90      0.79      0.84        24
           8       0.84      0.79      0.82        39
           9       0.83      0.77      0.80        26

    accuracy                           0.83       296
   macro avg       0.83      0.83      0.83       296
weighted avg       0.83      0.83      0.83       296

In [188]:
best_params = gridsearch.best_params_
best_params
Out[188]:
{'criterion': 'gini', 'max_depth': None, 'max_features': 'log2'}
In [189]:
rf = RandomForestClassifier(n_estimators=1000, **best_params)
rf.fit(x_train,y_train)

y_p = rf.predict(x_test)
print(classification_report(y_test, y_p))
              precision    recall  f1-score   support

           0       0.83      0.91      0.87        32
           1       0.74      0.88      0.81        26
           2       0.83      0.69      0.75        29
           3       0.73      0.77      0.75        35
           4       1.00      0.94      0.97        31
           5       1.00      1.00      1.00        26
           6       0.84      0.93      0.88        28
           7       0.91      0.88      0.89        24
           8       0.89      0.85      0.87        39
           9       0.87      0.77      0.82        26

    accuracy                           0.86       296
   macro avg       0.86      0.86      0.86       296
weighted avg       0.86      0.86      0.86       296

SVM

In [190]:
from sklearn.svm import SVC

# Hanno bisogno che il dataset sia z-normalizzato!! e funzionano meglio se discretizzato
svml = SVC(C=1, gamma='scale')
svml.fit(x_train, y_train)
y_p = svml.predict(x_test)
print(classification_report(y_test, y_p))
              precision    recall  f1-score   support

           0       0.82      0.88      0.85        32
           1       0.79      0.85      0.81        26
           2       0.69      0.62      0.65        29
           3       0.60      0.71      0.65        35
           4       0.97      0.97      0.97        31
           5       0.96      1.00      0.98        26
           6       0.90      1.00      0.95        28
           7       0.95      0.83      0.89        24
           8       0.94      0.77      0.85        39
           9       0.96      0.88      0.92        26

    accuracy                           0.84       296
   macro avg       0.86      0.85      0.85       296
weighted avg       0.85      0.84      0.85       296

Bayesian classificators

Naive Bayes assumption:

\begin{equation} p(f_1,..., f_n|c) = \prod_{i=1}^n p(f_i|c) \end{equation}

Dove $f_i$ è la feature i-esima e $c$ è una classe.
$p(f_i|c)$ è la distribuzione di probabilità della feature i nella classe c.

In questo caso un Gaussian Naive Bayes o un CategoricalNB dovrebbe andare.

In [191]:
from sklearn.naive_bayes import GaussianNB #, CategoricalNB
In [192]:
gaussnb = GaussianNB()
gaussnb.fit(x_train, y_train)
y_p = gaussnb.predict(x_test)
print(classification_report(y_test, y_p))
              precision    recall  f1-score   support

           0       0.62      0.88      0.73        32
           1       0.59      0.65      0.62        26
           2       0.50      0.03      0.06        29
           3       0.36      0.71      0.48        35
           4       0.85      0.71      0.77        31
           5       0.74      0.77      0.75        26
           6       0.85      0.79      0.81        28
           7       0.67      0.83      0.74        24
           8       0.78      0.36      0.49        39
           9       0.67      0.62      0.64        26

    accuracy                           0.62       296
   macro avg       0.66      0.63      0.61       296
weighted avg       0.66      0.62      0.60       296

In [193]:
#catenb = CategoricalNB()
#catenb.fit(x_train, y_train)
#y_p = catenb.predict(x_test)
#print(classification_report(y_test, y_p))

KNN

In [194]:
from sklearn.neighbors import KNeighborsClassifier

# FUnzionano meglio se il dataset è normalizzato!!
knn = KNeighborsClassifier()
knn.fit(x_train, y_train)
y_p = knn.predict(x_test)
print(classification_report(y_test, y_p))
              precision    recall  f1-score   support

           0       0.80      0.88      0.84        32
           1       0.70      0.81      0.75        26
           2       0.50      0.62      0.55        29
           3       0.53      0.49      0.51        35
           4       0.93      0.87      0.90        31
           5       0.89      0.96      0.93        26
           6       0.78      0.64      0.71        28
           7       0.90      0.79      0.84        24
           8       0.79      0.69      0.74        39
           9       0.79      0.85      0.81        26

    accuracy                           0.75       296
   macro avg       0.76      0.76      0.76       296
weighted avg       0.76      0.75      0.75       296

Logistic regression

In [195]:
from sklearn.linear_model import LogisticRegression

#Funziona meglio se il dataset è normalizzato (meglio z-score)!
lrclass = LogisticRegression(solver='lbfgs', multi_class='ovr', max_iter=100000)

lrclass.fit(x_train, y_train)
y_p = lrclass.predict(x_test)
print(classification_report(y_test, y_p))
              precision    recall  f1-score   support

           0       0.74      0.91      0.82        32
           1       0.74      0.77      0.75        26
           2       0.61      0.66      0.63        29
           3       0.67      0.51      0.58        35
           4       0.91      0.97      0.94        31
           5       0.96      0.96      0.96        26
           6       0.96      0.89      0.93        28
           7       0.91      0.88      0.89        24
           8       0.78      0.79      0.78        39
           9       0.83      0.77      0.80        26

    accuracy                           0.80       296
   macro avg       0.81      0.81      0.81       296
weighted avg       0.80      0.80      0.80       296